# filter companies for blue-green deployment

=begin

files:
proxy_access.log.2020-09-04-15.hz-api-gw002.gz
proxy_access.log.2020-09-04-15.hz-api-gw003.gz
proxy_access.log.2020-09-04-15.hz-api-gw004.gz
proxy_access.log.2020-09-04-15.hz-api-gw005.gz

bash commands

# zgrep -Eo ' "[[:digit:]]+" ' proxy_access.log.2020-09-04-15.hz-api-gw002.gz | grep -Eo '[[:digit:]]+' > /tmp/tids_stat_2020_09_04_002.log
# zgrep -Eo ' "[[:digit:]]+" ' proxy_access.log.2020-09-04-15.hz-api-gw003.gz | grep -Eo '[[:digit:]]+' > /tmp/tids_stat_2020_09_04_003.log
# zgrep -Eo ' "[[:digit:]]+" ' proxy_access.log.2020-09-04-15.hz-api-gw004.gz | grep -Eo '[[:digit:]]+' > /tmp/tids_stat_2020_09_04_004.log
# zgrep -Eo ' "[[:digit:]]+" ' proxy_access.log.2020-09-04-15.hz-api-gw005.gz | grep -Eo '[[:digit:]]+' > /tmp/tids_stat_2020_09_04_005.log


# cat /tmp/tids_stat_2020_09_04_002.log /tmp/tids_stat_2020_09_04_003.log /tmp/tids_stat_2020_09_04_004.log /tmp/tids_stat_2020_09_04_005.log > /tmp/tids_stat_0904.log


# wc -l /tmp/tids_stat_0904.log    # 16584399

# sort -n /tmp/tids_stat_0904.log | uniq -c | sort -nr > /tmp/freq.log

=end


# Reads a per-tenant request-frequency file, narrows it down to rollout
# candidates, looks up company details for each candidate and prints three
# batches of tenants to stdout.
#
# Every line of the file is expected to hold "<count> <tid>" separated by
# whitespace (the layout `uniq -c` produces). Lines without both columns are
# skipped.
#
# Filtering, in order:
# * the tenant's share of +total+ must be greater than 0.0001,
# * tid 0 is dropped,
# * anything listed in block_ids is dropped,
# * after the company lookup: fewer than 50 agents and a share (now expressed
#   as a percentage) greater than 0.02.
#
# @param file_name [String] path of the frequency file
# @param total [Integer] number of requests the counts were taken from;
#   defaults to the size of the 2020-09-04 sample
# @return [Object] whatever the final print_list call returns (nil with the
#   print_list defined in this file)
# @raise [ArgumentError] if +total+ is not greater than zero
def get_candidates(file_name:, total: 16_584_399)
  raise ArgumentError, "total must be greater than zero" unless total > 0

  freq_by_tid = File.foreach(file_name).each_with_object({}) do |line, freq|
    count, tid = line.split(" ").map(&:to_i)
    # A blank or one-column line yields a nil tid; keeping it would store a nil
    # key and break the company lookup further down.
    next if tid.nil?

    freq[tid] = count
  end

  # block_ids concatenates several lists on every call, so build it once.
  blocked = block_ids

  candidates = freq_by_tid.each_with_object({}) do |(tid, count), kept|
    ratio = count.fdiv(total)
    next unless ratio > 0.0001
    next if tid == 0 || blocked.include?(tid)

    kept[tid] = ratio
  end

  detailed_candidates = candidates.each_with_object({}) do |(tid, ratio), details|
    on_shard_by_company_id(tid) do
      com = Company.find(tid)
      details[tid] = {
        "subdomain" => com.subdomain,
        "ratio" => ratio * 100, # a percentage from here on
        "shard_id" => ActiveRecord::Base.current_shard_id,
        "company_name" => com.company_name,
        "agent_count" => com.agents.count,
        "tid" => tid
      }
    end
  end

  de = detailed_candidates.select do |_, info|
    info["agent_count"] < 50 && info["ratio"] > 0.02
  end

  # Full candidate list first, one CSV row per tenant.
  de.each_value do |h|
    puts "#{h['tid']},#{h['subdomain']},#{h['shard_id']},#{h['company_name']},#{h['ratio']},#{h['agent_count']}"
  end

  first_10_percent_tenants = select_first_10_percent(de)

  puts "First 10 percent:"
  print_list(first_10_percent_tenants)

  # Collect the chosen tids once instead of once per remaining tenant.
  first_tids = first_10_percent_tenants.map { |tenant| tenant["tid"] }
  remain_40_percent = de.reject { |tid, _| first_tids.include?(tid) }

  another_20_percent_tenants = select_20_percent(remain_40_percent)

  puts "another 20 percent:"
  print_list(another_20_percent_tenants)

  second_tids = another_20_percent_tenants.map { |tenant| tenant["tid"] }
  remain = remain_40_percent.reject { |tid, _| second_tids.include?(tid) }.values

  puts "yet another 20 percent:"
  print_list(remain)
end

# Picks a random batch of tenants worth roughly 20 (in "ratio" units).
#
# Only the first 150 values of +h+ are considered. They are shuffled and
# visited once; a tenant is accepted whenever the ratios accepted so far add
# up to 20 or less, so the batch may overshoot 20 by its last member.
#
# @param h [Hash] tid => tenant detail hash carrying a "ratio" entry
# @return [Array<Hash>] the accepted tenant detail hashes
def select_20_percent(h)
  h.values.first(150).shuffle.each_with_object([]) do |tenant, picked|
    ratio_so_far = picked.inject(0) { |sum, chosen| sum + chosen["ratio"] }
    picked << tenant if ratio_so_far <= 20
  end
end

# Picks a random batch of tenants worth roughly 10 (in "ratio" units).
#
# Only the first 30 values of +de+ are considered. They are shuffled and
# visited once; a tenant is accepted whenever the ratios accepted so far add
# up to 10 or less, so the batch may overshoot 10 by its last member.
#
# @param de [Hash] tid => tenant detail hash carrying a "ratio" entry
# @return [Array<Hash>] the accepted tenant detail hashes
def select_first_10_percent(de)
  de.values.first(30).shuffle.each_with_object([]) do |tenant, picked|
    ratio_so_far = picked.inject(0) { |sum, chosen| sum + chosen["ratio"] }
    picked << tenant if ratio_so_far <= 10
  end
end

# Writes tenant rows to stdout as comma-separated lines, framed above and
# below by a blank line and a "----" rule.
#
# Columns, in order: tid, subdomain, shard_id, company_name, ratio,
# agent_count. Missing entries print as empty fields.
#
# @param array [Array<Hash>] tenant detail hashes with string keys
# @return [nil]
def print_list(array)
  rule = "----"
  columns = %w[tid subdomain shard_id company_name ratio agent_count]

  puts
  puts rule

  array.each do |tenant|
    puts columns.map { |column| tenant[column].to_s }.join(",")
  end

  puts rule
  puts
end

# Tenant ids that block_ids folds into the exclusion list.
#
# @return [Array<Integer>] a new array on every call, in ascending order
def key_ids
  [
    1660, 2449, 4166, 4941, 5771, 5899, 6009, 7350, 7931, 9979,
    11508, 11708, 13300, 14872, 18877, 20557, 20777, 21024, 23648, 23844,
    23846, 23926, 24090, 24613, 24851, 24973, 26706, 26741, 27085, 27441,
    28253, 28270, 28659, 29163, 29480, 30010, 30109, 30277, 30334, 30575,
    30818, 31282, 31485, 31939, 32034, 32425, 32859, 33748, 34004, 34754,
    34954, 35091, 35190, 35197, 35585, 35670, 35826, 36215, 36280, 36837,
    36981, 37151, 37261, 37433, 38059, 38210, 38391, 38445, 38453, 39200,
    40274, 40557, 41344, 42853, 43654, 44024, 44882, 44935, 45516, 45528,
    46254, 46808, 46880, 47249, 47649, 48067, 48738, 49467, 49845, 49965,
    52074, 53036, 57114, 58354
  ]
end

# A second list of tenant ids that block_ids folds into the exclusion list.
#
# @return [Array<Integer>] a new array on every call, in the order listed
def other_key_ids
  %w[
    49280 47684 29362 41003 56533 62578 59164 36672 39649 61178
    32569 41726 39542 57792 55201 58995 41748 40611 9041 32798
    49424 39328 45997 46001 46639 56353 40063 36969 26661 42151
    41298 34779 9198 46991 44445 44564 39958 40753 27429 44314
    33125 30377 9304 2430 47976 49948 36534 48233 46291 56813
    45259 18582 49245 21430 54203
  ].map(&:to_i)
end

# A short list of tenant ids that block_ids folds into the exclusion list.
#
# @return [Array<Integer>] a new array on every call, in the order listed
def special_ids
  [1, 26, 51159, 5427, 6322, 20196]
end

# Every tenant id that must not be offered as a candidate: the four id lists
# joined end to end in a fixed order. Duplicates across lists are kept.
#
# @return [Array<Integer>]
def block_ids
  [key_ids, other_key_ids, special_ids, further_block_ids].reduce(:+)
end

# Additional tenant ids that block_ids folds into the exclusion list.
#
# The result is three groups joined in their original order; repeated entries
# (63231 and 77901 each appear twice) are kept as they were. The trailing notes
# carry what the original annotations recorded: the subdomain and, where
# available, company name (romanized), shard id, ratio (rounded) and agent
# count. Those fields follow the column order of the rows this script prints.
#
# @return [Array<Integer>] a new array on every call
def further_block_ids
  annotated = [
    63231, # star-services
    79821, # dorreta
    76311, # nbdeli
    73341, # juzifenqi919juzifenqi
    84921, # cfca11
    42731, # qxdaojia
    59882, # gq
    77901, # holiland
    36672, # baotuwang
    45259, # kuaidi100
    56813, # lining
    63231, # star-services (repeated)
    72891, # bosideng
    77901, # holiland (repeated)
    46837, # 400
    52002, # xiaoyibao
    52763, # erongsheng
    30818, # wuxiapptech-it
    24973, # pkfare
    5771,  # joyowo
    37433, # 21cake
    76211, # xiaobang
    48067, # wdmcake
    20777, # bdkj
    4166,  # yyxueche
    41344, # yinxiang
    9041,  # labnetwork
    38995, # letote
    16383, # starbuckschinapcc
    29723, # msccruises | Mediterranean Cruises Shipping (Shanghai) Co., Ltd. | shard 1 | ratio ~0.266 | agents 16
    83221, # 1816057 | Shanghai Weimeng Enterprise Development Co., Ltd. | shard 1 | ratio ~0.677 | agents 30
    48712, # 13570269686 | Guangzhou Heji Information Technology Co., Ltd. | shard 2 | ratio ~0.060 | agents 25
    41192, # lakala
    83681, # wm-motor | Weima Automobile Technology Group Co., Ltd. | shard 1 | ratio ~0.068 | agents 17
    40522, # xhsd | Xinhua Hulian E-commerce Co., Ltd. | shard 2 | ratio ~0.228 | agents 38
    49036, # gpmartyy | Beijing Yonyou Government Software Co., Ltd. | shard 1 | ratio ~0.323 | agents 2
    77871, # cifi01 | Xuhui Group Co., Ltd. | shard 1 | ratio ~0.092 | agents 19
    86071, # easi | EASI Australia Pty Ltd | shard 1 | ratio ~0.186 | agents 48
    71571, # shczt | Shanghai Chunzhitang Biological Products Co., Ltd. | shard 1 | ratio ~0.103 | agents 25
    31664, # tslsmart | Tesilian (Beijing) Technology Co., Ltd. | shard 1 | ratio ~0.064 | agents 9
    73951, # banggood | Guangzhou Banggu Technology Co., Ltd. | shard 1 | ratio ~0.273 | agents 45
    44411, # jimmymove | Shenzhen Jimi Life Services Co., Ltd. | shard 2 | ratio ~0.201 | agents 30
    48840  # mokahr | Beijing Xiruiyasi Technology Co., Ltd. | shard 1 | ratio ~0.341 | agents 15
  ]

  # Recorded in the original notes as a dump of detail hashes.
  from_detail_dump = [
    4372,  # danke | Ziwutong (Beijing) Asset Management Co., Ltd. | shard 1 | ratio ~0.872 | agents 50
    59180, # vvic | Guangzhou Bazhuayu Business Services Co., Ltd. | shard 3 | ratio ~0.596 | agents 64
    81451, # tsttst1 | Shanghai Daerwei Trading Co., Ltd. | shard 1 | ratio ~0.443 | agents 75
    16,    # inspur | Langchao Electronic Information Industry Co., Ltd. | shard 1 | ratio ~0.183 | agents 50
    80951, # yilian | Chengdu Yiyun Technology Co., Ltd. | shard 1 | ratio ~0.174 | agents 59
    96441, # agile400 | Tianjin Yajule Enterprise Management Services Co., Ltd. | shard 1 | ratio ~0.091 | agents 80
    71471, # qddfhr | Qingda Dongfang Human Resources Co., Ltd. | shard 1 | ratio ~0.087 | agents 63
    84951, # dongfangyuhong | Beijing Dongfang Yuhong Waterproof Technology Co., Ltd. | shard 1 | ratio ~0.086 | agents 68
    54411, # kwg | Guangzhou Lihe Property Development Co., Ltd. | shard 1 | ratio ~0.079 | agents 77
    58975, # student | Jiangqu Investment Consulting (Shanghai) Co., Ltd. (Xuelvjia) | shard 2 | ratio ~0.078 | agents 54
    92961, # htdkgroup | Huatang Dachang Commercial (Shanghai) Co., Ltd. HTDK | shard 1 | ratio ~0.056 | agents 55
    95201, # semir | Zhejiang Senma Garment Co., Ltd. | shard 1 | ratio ~0.051 | agents 60
    63661  # hongthai | Kangtai Travel Agency Co., Ltd. | shard 1 | ratio ~0.039 | agents 50
  ]

  # Recorded in the original notes as two comma-separated rows.
  from_csv_rows = [
    63081, # vandream | Wanjun Green Building Technology Co., Ltd. | shard 3 | ratio ~0.042 | agents 11
    62642  # kwm | Beijing Jindu Law Firm | shard 1 | ratio ~0.054 | agents 10
  ]

  annotated + from_detail_dump + from_csv_rows
end

# Script entry point: run the selection against the frequency file produced
# by the shell pipeline described in the header comment.
frequency_log = "/tmp/freq.log"
get_candidates(file_name: frequency_log)
